{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__title__      = ML in Action Chapter7 Code(AdaBoost)\n",
    "__author__     = wgj\n",
    "__createDate__ = 2018-11-01 21:10:33"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import *\n",
    "\n",
    "\"\"\" 加载数据集 \"\"\"\n",
    "def loadDataSet(fileName): \n",
    "    \"\"\" \n",
    "    输入：fileName——文件名\n",
    "    输出：1、dataMat——特征矩阵（m行xn列）\n",
    "                 2、labelMat——标签向量（m行x1列）\n",
    "    \"\"\"\n",
    "    numFeat = len(open(fileName).readline().split('\\t'))      #  数据集列数\n",
    "    dataMat = []; labelMat = []\n",
    "    fr = open(fileName)\n",
    "    for line in fr.readlines():\n",
    "        lineArr =[]                                                  # 保存一行训练数据\n",
    "        curLine = line.strip().split('\\t')              # 移除每行首尾空格并用tab分割数据\n",
    "        for i in range(numFeat-1):                   # 每行22个数，前21个为特征\n",
    "            lineArr.append(float(curLine[i]))\n",
    "        dataMat.append(lineArr)\n",
    "        labelMat.append(float(curLine[-1]))\n",
    "    return dataMat,labelMat\n",
    "\n",
    "\"\"\" 树桩分类 \"\"\"\n",
    "def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):\n",
    "    \"\"\" \n",
    "    输入：1、dataMatrix——特征矩阵（m行xn列）\n",
    "                 2、dimen——特征索引\n",
    "                 3、threshVal——阈值\n",
    "                 4、threshIneq——比较方法（<，≥）\n",
    "    输出：retArray——分类结果\n",
    "    \"\"\"\n",
    "    retArray = ones((shape(dataMatrix)[0],1))                           # 初始化为全1向量\n",
    "    if threshIneq == 'lt':                                                                  # ‘lt‘：less than，小于；‘gt‘：greater than，大于\n",
    "        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0        # 特征与阈值满足threshIneq关系的，分为-1类\n",
    "    else:\n",
    "        retArray[dataMatrix[:,dimen] > threshVal] = -1.0 \n",
    "    return retArray\n",
    "    \n",
    "\"\"\" 构建单层决策树 \"\"\"\n",
    "def buildStump(dataArr,classLabels,D):\n",
    "    \"\"\" \n",
    "    输入：1、dataArr——特征矩阵（m行xn列）\n",
    "                 2、classLabels——标签向量\n",
    "                 3、D——权向量\n",
    "    输出：1、bestStump——最佳单层决策树（保存为字典）\n",
    "                 2、minError——最小误差\n",
    "                 3、bestClasEst——最佳分类结果\n",
    "    \"\"\"\n",
    "    dataMatrix = mat(dataArr); labelMat = mat(classLabels).T\n",
    "    m,n = shape(dataMatrix)\n",
    "    numSteps = 10.0; bestStump = {}; bestClasEst = mat(zeros((m,1)))\n",
    "    minError = inf       # 初始化误差和为正无穷\n",
    "    for i in range(n):   # 遍历所有特征\n",
    "        rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max();    # 第i个特征大最小值和最大值\n",
    "        stepSize = (rangeMax-rangeMin)/numSteps                                               # 步长\n",
    "        for j in range(-1,int(numSteps)+1):                                                               # 在当前特征遍历步长\n",
    "            for inequal in ['lt', 'gt']:  # 在大于、小于间切换\n",
    "                threshVal = (rangeMin + float(j) * stepSize)                                        # 设置阈值\n",
    "                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)    # 依据当前特征进行分类\n",
    "                errArr = mat(ones((m,1)))\n",
    "                errArr[predictedVals == labelMat] = 0\n",
    "                weightedError = D.T*errArr                         # 计算误差和\n",
    "                #print \"split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f\" % (i, threshVal, inequal, weightedError)\n",
    "                if weightedError < minError:\n",
    "                    minError = weightedError\n",
    "                    bestClasEst = predictedVals.copy()\n",
    "                    bestStump['dim'] = i                            # 单层决策树，树桩节点为（特征索引、阈值和比较方法）\n",
    "                    bestStump['thresh'] = threshVal\n",
    "                    bestStump['ineq'] = inequal\n",
    "    return bestStump,minError,bestClasEst\n",
    "\n",
    "\"\"\" 训练AdaBoost \"\"\"\n",
    "def adaBoostTrainDS(dataArr,classLabels,numIt=40):\n",
    "    \"\"\" \n",
    "    输入：1、dataArr——特征矩阵（m行xn列）\n",
    "                 2、classLabels——标签向量\n",
    "                 3、numIt——迭代次数\n",
    "    输出：1、weakClassArr——分类器参数\n",
    "                 2、aggClassEst——每个数据点的类别估计累计值\n",
    "    \"\"\"\n",
    "    weakClassArr = []\n",
    "    m = shape(dataArr)[0]\n",
    "    D = mat(ones((m,1))/m)   # 初始化所有权重为1/m\n",
    "    aggClassEst = mat(zeros((m,1)))\n",
    "    for i in range(numIt):\n",
    "        bestStump,error,classEst = buildStump(dataArr,classLabels,D)    # 构建单层决策树 \n",
    "        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))                         # 计算alpha\n",
    "        bestStump['alpha'] = alpha \n",
    "        weakClassArr.append(bestStump)               # 保存分类器参数\n",
    "        expon = multiply(-1*alpha*mat(classLabels).T,classEst)            #  -alpha×yi×Gm\n",
    "        D = multiply(D,exp(expon))                              # 更新权向量\n",
    "        D = D/D.sum()\n",
    "        #  计算所有分类器的误差和，若为0，结束循环\n",
    "        aggClassEst += alpha*classEst              # 基本分类器的线性组合\n",
    "        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1)))\n",
    "        errorRate = aggErrors.sum()/m\n",
    "        print(\"total error: \",errorRate) \n",
    "        if errorRate == 0.0: break\n",
    "    return weakClassArr,aggClassEst\n",
    "\n",
    "\"\"\" 使用AdaBoost进行分类\"\"\"\n",
    "def adaClassify(datToClass,classifierArr):\n",
    "    \"\"\" \n",
    "    输入：1、datToClass——待分类样本\n",
    "                 2、classifierArr——AdaBoost分类器\n",
    "    输出：sign(aggClassEst)——\n",
    "    \"\"\"\n",
    "    dataMatrix = mat(datToClass)\n",
    "    m = shape(dataMatrix)[0]\n",
    "    aggClassEst = mat(zeros((m,1)))\n",
    "    for i in range(len(classifierArr)):\n",
    "        classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\\\n",
    "                                 classifierArr[i]['thresh'],\\\n",
    "                                 classifierArr[i]['ineq'])                                                 # 使用AdaBoost分类器信息进行决策树分类\n",
    "        aggClassEst += classifierArr[i]['alpha']*classEst\n",
    "#         print(aggClassEst) \n",
    "    return sign(aggClassEst)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从疝气病马预测死亡率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total error:  0.2842809364548495\n",
      "total error:  0.2842809364548495\n",
      "total error:  0.24749163879598662\n",
      "total error:  0.24749163879598662\n",
      "total error:  0.25418060200668896\n",
      "total error:  0.2408026755852843\n",
      "total error:  0.2408026755852843\n",
      "total error:  0.22073578595317725\n",
      "total error:  0.24749163879598662\n",
      "total error:  0.23076923076923078\n"
     ]
    }
   ],
   "source": [
    "dataArr, labelArr = loadDataSet('horseColicTraining2.txt')\n",
    "classifierArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "testArr, testLabelArr = loadDataSet('horseColicTest2.txt')\n",
    "prediction = adaClassify(testArr,classifierArr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "错误率为： 0.23880597014925373\n"
     ]
    }
   ],
   "source": [
    "errArr = mat(ones((len(testLabelArr),1)))\n",
    "errRate = errArr[prediction!=mat(testLabelArr).T].sum()/len(testLabelArr)\n",
    "print('错误率为：',errRate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
